In [1]:
import pandas as pd
import numpy as np
import pymysql
import pymongo
import re
import jieba
import logging
In [2]:
import plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
In [3]:
# Connect to the MongoDB server on localhost:27017.
mongo_client = pymongo.MongoClient("localhost", 27017)
In [4]:
# Select the `db_huangsy` database on that server.
db = mongo_client.db_huangsy
In [5]:
# Handle to the `douban_sz_rent_group_v2` collection.
sz_rent_group = db.douban_sz_rent_group_v2
In [6]:
# Empty filter: the cursor iterates over every document in the collection.
doc_cur = sz_rent_group.find({})
In [7]:
# Drain the cursor into a list of documents, then build one DataFrame from it.
item_list = [item for item in doc_cur]

df_rent = pd.DataFrame(item_list)
In [9]:
# Take a random 100-row sample of the rows whose room_rent is not the empty string.
# NOTE(review): no random_state is passed, so the sample changes on every run, and
# .sample(100) raises if fewer than 100 rows match.
d1 = df_rent.loc[df_rent.room_rent != ""].sample(100)
# NOTE(review): hard-coded absolute Windows path; consider a configurable output directory.
d1.to_excel(r"D:\pandas data\test1.xlsx")
In [10]:
# New frame without the 'content' column; df_rent itself is left untouched.
df_rent_nc = df_rent.drop(columns='content')
In [11]:
# 'date' is the first 10 characters of response_time
# (presumably the YYYY-MM-DD part -- confirm against the stored format).
df_rent_nc['date'] = df_rent_nc['response_time'].apply(lambda x: x[0:10])
In [12]:
# Convert the text-valued columns to numbers; an empty string becomes NaN.
df_rent_nc['room_rent'] = df_rent_nc['room_rent'].apply(lambda x: float(x) if x != "" else np.nan)
# `np.int` was only an alias for the builtin `int`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly.
df_rent_nc['girl_only'] = df_rent_nc['girl_only'].astype(int)
df_rent_nc['man_only'] = df_rent_nc['man_only'].astype(int)
df_rent_nc['entire_room_num'] = df_rent_nc['entire_room_num'].apply(lambda x: int(x) if x != "" else np.nan)
In [13]:
# Keep rows with post_year >= '2014'; the comparison is a string comparison.
df_rent_plot = df_rent_nc.loc[df_rent_nc.post_year >= '2014']
In [14]:
from pyecharts import Bar, Line, Overlap, Page, WordCloud
from pyecharts import online
online()

豆瓣深圳租房团各区房租年走势

In [15]:
def graph_lines_rent(res_district_rent, width=960, height=400, **kwargs):
    """Build a pyecharts ``Line`` chart with one smoothed series per key.

    :param res_district_rent: mapping of series name -> ``{'x': [...], 'y': [...]}``.
    :param width: chart width handed to ``Line``.
    :param height: chart height handed to ``Line``.
    :param kwargs: optional overrides -- ``title``, ``yaxis_name``,
        ``yaxis_formatter``, ``is_label_show`` and ``xaxis_rotate``.
    :returns: the populated ``Line`` object.
    """
    chart_title = kwargs.get("title", "深圳豆瓣租房团各区房租年走势")
    palette = ["#ff0000", "#00ff00", "#00ffff",
               "#0000ff", "#ff00ff", "#4a86e8", "#7f6000"]
    # Options shared by every series added to the chart.
    series_options = dict(
        xaxis_rotate=kwargs.get("xaxis_rotate", 0),
        yaxis_formatter=kwargs.get("yaxis_formatter", ""),
        yaxis_name_gap=50,
        yaxis_name=kwargs.get("yaxis_name", ""),
        yaxis_label_textcolor="blue",
        line_width=2,
        legend_top="8%",
        label_color=palette,
        is_smooth=True,
        is_label_show=kwargs.get("is_label_show", True),
    )
    chart = Line(chart_title, title_text_size=20, title_pos="center",
                 title_top="1%", width=width, height=height)
    for series_name, series in res_district_rent.items():
        chart.add(series_name, series['x'], series['y'], **series_options)
    return chart
In [16]:
def get_block_res(df, block="district", time_col="post_year"):
    """Split ``df`` into per-block x/y lists for plotting.

    Note: this name is defined again in a later cell (with an extra ``y_col``
    parameter); when the notebook runs top to bottom that later definition
    replaces this one.

    :param df: DataFrame with the ``block``, ``time_col`` and ``room_rent`` columns.
    :param block: column whose distinct values become the keys of the result.
    :param time_col: column supplying the x values.
    :returns: ``{block_value: {'x': [...], 'y': [...]}}`` in order of first appearance.
    """
    def _series_for(value):
        # Rows belonging to one block value, in their original order.
        subset = df.loc[df[block] == value]
        return {'x': subset[time_col].tolist(), 'y': subset['room_rent'].tolist()}

    return {value: _series_for(value) for value in df[block].unique()}
In [17]:
def get_block_res(df, block="district", time_col="post_year", y_col='room_rent'):
    """Group ``df`` by the values of ``block`` and collect x/y lists per group.

    :param df: DataFrame containing the ``block``, ``time_col`` and ``y_col`` columns.
    :param block: column whose distinct values become the keys of the result.
    :param time_col: column supplying the x values.
    :param y_col: column supplying the y values.
    :returns: ``{block_value: {'x': [...], 'y': [...]}}`` in order of first appearance.
    """
    series_by_block = {}
    for key in df[block].unique():
        subset = df.loc[df[block] == key]
        series_by_block[key] = dict(x=subset[time_col].tolist(),
                                    y=subset[y_col].tolist())
    return series_by_block
In [18]:
def filter_mean(x, trim=3):
    """Trimmed mean, truncated to an int.

    NaNs are dropped, the values are sorted, the ``trim`` smallest and the
    ``trim`` largest are discarded, and the mean of what is left is truncated
    with ``int()``.

    :param x: any array-like of numbers (list, ndarray, pandas Series, ...).
    :param trim: how many values to discard at each end (default 3).
    :returns: ``int`` trimmed mean, or ``np.nan`` when at most ``2 * trim``
        non-NaN values are available.
    """
    # Converting first lets plain lists work; boolean-mask indexing would fail on them.
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if len(values) <= 2 * trim:
        return np.nan
    values = np.sort(values)
    # Explicit end index so that trim=0 keeps everything (a "-0" slice would be empty).
    kept = values[trim:len(values) - trim]
    return int(np.mean(kept))

def filter_median(x, trim=3):
    """Trimmed median, truncated to an int.

    NaNs are dropped, the values are sorted, the ``trim`` smallest and the
    ``trim`` largest are discarded, and the median of what is left is truncated
    with ``int()``.

    :param x: any array-like of numbers (list, ndarray, pandas Series, ...).
    :param trim: how many values to discard at each end (default 3).
    :returns: ``int`` trimmed median, or ``np.nan`` when at most ``2 * trim``
        non-NaN values are available.
    """
    # Converting first lets plain lists work; boolean-mask indexing would fail on them.
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if len(values) <= 2 * trim:
        return np.nan
    values = np.sort(values)
    # Explicit end index so that trim=0 keeps everything (a "-0" slice would be empty).
    kept = values[trim:len(values) - trim]
    return int(np.median(kept))
In [19]:
# Drop the rows of four districts; note that this rebinds df_rent_plot,
# so every later cell sees the reduced frame.
df_rent_plot = df_rent_plot.loc[~df_rent_plot.district.isin(['光明','坪山', '盐田', '大鹏'])]
In [20]:
# Rows with rent_type '招租' or '转租', a non-empty district and a non-null rent.
offer_mask = (
    df_rent_plot.rent_type.isin(['招租','转租'])
    & (df_rent_plot.district != "")
    & pd.notna(df_rent_plot.room_rent)
)
district_rent = df_rent_plot.loc[offer_mask]

# filter_mean of the rent per (year, district), ordered by year.
yearly_groups = district_rent[['post_year', 'district', 'room_rent']].groupby(['post_year', 'district'])
district_rent_year = yearly_groups.agg({'room_rent': filter_mean}).reset_index()
district_rent_year = district_rent_year.sort_values(by='post_year', ascending=True)
In [21]:
# Districts shown in the per-district charts, in display order.
district_list = ["南山","福田","宝安", "罗湖", "龙岗","龙华"]
In [22]:
# One percent-normalised histogram trace per district for 2019 rents below 5000.
data = []
# Shared layout: y axis is labelled in percent.
layout = dict(  
         title = "2019年(截止0615)豆瓣深圳租房团各区价格分布",
         xaxis=dict(title="平均单房房租"),
         yaxis = dict(side = 'left', ticksuffix="%" ,zeroline = False)
        )
# The enumerate index `i` is not used inside the loop.
for i, district in enumerate(district_list):
    df = district_rent.loc[(district_rent.district == district) & (district_rent.post_year=='2019')]
    # Cut the long tail so the bins stay readable.
    df = df.loc[df.room_rent < 5000]
    data.append(go.Histogram(x=df.room_rent, nbinsx=25, name=district,histnorm='percent',showlegend=True))
fig = dict(data=data,layout=layout)
plotly.offline.iplot(fig)
In [23]:
# Reshape the yearly aggregates into {district: {'x': years, 'y': rents}}.
res_district_rent = get_block_res(district_rent_year, block='district', time_col='post_year')
In [24]:
# Build the yearly trend chart; the bare `line` makes it the cell's displayed output.
line = graph_lines_rent(res_district_rent, **{'title': "豆瓣深圳租房团各区平均单房房租年走势"})
line
Out[24]:

2018年豆瓣深圳租房团各区房租月走势

In [25]:
# Rows with rent_type '招租' or '转租' in post_year '2018' that have a district and a rent.
offer_2018_mask = (
    df_rent_plot.rent_type.isin(['招租','转租'])
    & (df_rent_plot.district != "")
    & pd.notna(df_rent_plot.room_rent)
    & (df_rent_plot.post_year=='2018')
)
district_rent = df_rent_plot.loc[offer_2018_mask]

# filter_mean of the rent per (month, district), ordered by month.
monthly_groups = district_rent[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district'])
district_rent_ym = monthly_groups.agg({'room_rent': filter_mean}).reset_index()
district_rent_ym = district_rent_ym.sort_values(by='year_month', ascending=True)
In [26]:
# Reshape the monthly aggregates into {district: {'x': months, 'y': rents}}.
res_district_rent = get_block_res(district_rent_ym, block='district', time_col='year_month')
In [27]:
# Build the 2018 monthly trend chart and display it as the cell output.
line = graph_lines_rent(res_district_rent, **{'title': "2018年豆瓣深圳租房团各区房租月走势"})
line
Out[27]:

豆瓣深圳租房团各区年月求租贴子数

In [28]:
# Rows with rent_type '求租' and a non-empty district, excluding year_month '2019-06'.
district_rent = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['求租']))& (df_rent_plot.district != "")&(df_rent_plot.year_month!='2019-06')]
# NOTE(review): .count() counts only rows whose room_rent is non-null; if the chart is
# meant to show the number of posts, .size() would count every row -- confirm intent.
district_rent_ym = district_rent[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district']).count().reset_index()

district_rent_ym = district_rent_ym.sort_values(by='year_month', ascending=True)
# After the count, 'room_rent' holds the per-group count rather than a rent.
district_rent_ym['room_rent'] = district_rent_ym['room_rent'].apply(lambda x: int(x))

res_district_rent = get_block_res(district_rent_ym, block='district', time_col='year_month')
In [29]:
# Monthly count chart; point labels are hidden and x labels rotated by 30 degrees.
line = graph_lines_rent(res_district_rent, **{'title': "豆瓣深圳租房团各区年月求租贴子数",'is_label_show': False, 'xaxis_rotate':30})
line
Out[29]:

豆瓣深圳租房团各区年月招转租贴子数

In [30]:
# Rows with rent_type '招租' or '转租' and a non-empty district, excluding year_month '2019-06'.
district_rent = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.district != "")&(df_rent_plot.year_month!='2019-06')]
# NOTE(review): .count() counts only rows whose room_rent is non-null; use .size()
# if every post should be counted -- confirm intent.
district_rent_ym = district_rent[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district']).count().reset_index()

district_rent_ym = district_rent_ym.sort_values(by='year_month', ascending=True)
# After the count, 'room_rent' holds the per-group count rather than a rent.
district_rent_ym['room_rent'] = district_rent_ym['room_rent'].apply(lambda x: int(x))

res_district_rent = get_block_res(district_rent_ym, block='district', time_col='year_month')
In [31]:
# Monthly count chart; point labels are hidden and x labels rotated by 30 degrees.
line = graph_lines_rent(res_district_rent,**{'title': "豆瓣深圳租房团各区年月招转租贴子数",'is_label_show': False, 'xaxis_rotate':30})
line
Out[31]:

豆瓣深圳租房团2019年各区贴子数租寻占比

In [32]:
# 2019 rows that have both a rent_type and a district.
district_rent_rate = df_rent_plot.loc[(df_rent_plot.rent_type != "")&(df_rent_plot.district != "")&(df_rent_plot.post_year == '2019')].copy()
# Per (district, rent_type): number of rows with a non-null room_rent (pandas .count() semantics).
district_rent_r2019 = district_rent_rate[[ 'district', 'rent_type', 'room_rent']].groupby(['district', 'rent_type']).count().reset_index()
# Descending rent_type order, so every pie lists its slices in the same order.
district_rent_r2019 = district_rent_r2019.sort_values('rent_type', ascending=False)
In [33]:
# Six donut charts laid out by hand on a 3-column x 2-row grid via each trace's `domain`.
district_list = ["南山","福田","宝安", "罗湖", "龙岗","龙华"]
data = list()
anns = list()
for i, district in enumerate(district_list):
    df = district_rent_r2019.loc[district_rent_r2019.district == district]
    # Column = i % 3, row = int(i / 3); each cell is 0.33 wide and 0.4 tall with a 0.05 gap on top.
    data.append(go.Pie(labels=df.rent_type.tolist(), values=df.room_rent.tolist(), name=district, hoverinfo='label+percent', textinfo='percent', 
               textfont=dict(size=10), hole=0.4, domain=dict(x=[0.33*(i%3),0.33*(i%3+1)],y=[0.4*(int(i/3)), 0.4*(int(i/3+1))-0.05])))
    # District label anchored at the left edge / top of the matching pie cell.
    anns.append(dict(font=dict(size=20),
            text=district,
            showarrow=False,
            x= 0.33*(i%3),
            y=0.4*(int(i/3)+1)-0.05
            ))
layout = go.Layout(title ='2019年豆瓣深圳租房团各区贴子数租寻占比',
                   annotations=anns,
                   height = 600,
                   width = 960
                   )
fig = go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

豆瓣深圳租房团各区租寻比走势

In [34]:
# district_rent: rows with rent_type '招租'/'转租'; district_look: rows with rent_type '求租'.
district_rent = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.district != "")].copy()
district_look = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['求租']))& (df_rent_plot.district != "")]
# NOTE(review): the result is named district_look_ym but is computed from district_rent,
# not district_look -- this looks like a copy-paste slip; confirm which frame was intended.
district_look_ym = district_rent[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district']).median().reset_index()
In [35]:
# Attach the per-(month, district) median as 'look_rent' to every row of district_rent.
district_look_ym.rename(columns={'room_rent': 'look_rent'}, inplace=True)
district_rent = district_rent.merge(district_look_ym, how='left', on=['year_month','district'])
In [36]:
# is_match: True when room_rent is within 20% of look_rent (relative to look_rent).
# The only filters on is_match in the following cell are commented out.
district_rent['is_match'] = district_rent[['room_rent','look_rent']].apply(lambda x: np.abs(x['room_rent']- x['look_rent'])/x['look_rent'] < 0.2, axis=1)
In [37]:
# Per (month, district) counts of non-null room_rent, for '求租' rows (look_rent) ...
district_look_ym = district_look[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district']).count().reset_index()
district_look_ym.rename(columns={'room_rent': 'look_rent'}, inplace=True)
# ... and for '招租'/'转租' rows (room_rent). No is_match filter is applied here.
district_rent_ym = district_rent[['year_month', 'district', 'room_rent']].groupby(['year_month', 'district']).count().reset_index()
In [38]:
# rent_look_rate = room_rent count / look_rent count, rounded to one decimal.
district_rent_ym = district_rent_ym.merge(district_look_ym, how='left', on=['year_month','district'])
district_rent_ym['rent_look_rate'] = (district_rent_ym['room_rent']/district_rent_ym['look_rent']).apply(lambda x: np.round(x,1))
In [39]:
# Division by a zero count yields +/-inf: treat those as missing.
district_rent_ym.loc[district_rent_ym.rent_look_rate.isin([np.inf,-np.inf]),'rent_look_rate'] = np.nan
# Cap the ratio at 40 so single extreme months do not flatten the rest of the chart.
district_rent_ym.loc[district_rent_ym.rent_look_rate >= 40 ,'rent_look_rate'] = 40
In [40]:
# Reshape into {district: {'x': months, 'y': rent_look_rate}}.
res_district_rent = get_block_res(district_rent_ym, block='district', time_col='year_month', y_col="rent_look_rate")
In [41]:
# Ratio trend chart; point labels are hidden and x labels rotated by 30 degrees.
line = graph_lines_rent(res_district_rent,**{'title': "豆瓣深圳租房团各区租寻比走势",'is_label_show': False, 'xaxis_rotate':30})
line
Out[41]:

豆瓣深圳租房团地铁站附近房租

In [42]:
# Rows with rent_type '招租'/'转租' and a non-empty metro_station.
district_rent = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.metro_station != "")]
# Station x year table of filter_mean(room_rent).
metro_rent_year = pd.pivot_table(district_rent, index='metro_station',columns='post_year', values='room_rent', aggfunc= filter_mean)
In [43]:
# Blank out year-over-year jumps that look like outliers.
metro_rent_year.index.name = ""
metro_rent_year.columns.name = ""
columns_list = list(metro_rent_year.columns)
for row_name, row_val in metro_rent_year.iterrows():
    # Reference level for this station: mean over its year columns.
    row_mean = np.mean(row_val)
    # Walk over each pair of adjacent year columns.
    for i, col in enumerate(columns_list[:-1]):
        col_next = columns_list[i+1]
        # Size of the jump between the two years, relative to the row mean.
        delta = np.abs(row_val[col] - row_val[col_next])/row_mean
        if np.isnan(delta):
            # At least one of the two values (or the mean) is missing: nothing to compare.
            continue
        else:
            if delta > 0.5:
                # Of the two values, blank the one that lies farther from the row mean.
                # The comparison reads row_val, while the write goes to metro_rent_year.
                if np.abs(row_val[col] - row_mean) > np.abs(row_val[col_next] - row_mean):
                    metro_rent_year.loc[row_name, col] = np.nan
                else:
                    metro_rent_year.loc[row_name, col_next] = np.nan
In [44]:
# Rank stations by their 2019 value, highest first.
metro_rent_year = metro_rent_year.sort_values(by="2019", ascending=False)
# Add a 1-based rank column and move the station name from the index into a column.
metro_rent_year['order']= list(range(1, metro_rent_year.shape[0] + 1))
metro_rent_year['metro_station']= metro_rent_year.index
metro_rent_year.index = list(range(1, metro_rent_year.shape[0] + 1))
# Reorder columns: the two just-added columns first, then the year columns.
metro_rent_year = metro_rent_year[list(metro_rent_year.columns[-2:])+list(metro_rent_year.columns[0:-2])]
In [45]:
# Same row selection as for metro_rent_year, but aggregated with len: rows per station and year.
district_rent = df_rent_plot.loc[(df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.metro_station != "")]
metro_cnt_year = pd.pivot_table(district_rent, index='metro_station',columns='post_year', values='room_rent', aggfunc= len)
In [46]:
# Rank stations by their 2019 count, highest first.
metro_cnt_year = metro_cnt_year.sort_values(by="2019", ascending=False)
In [47]:
# Same table layout as metro_rent_year: rank column, station column, then the years.
metro_cnt_year.index.name = ""
metro_cnt_year.columns.name = ""
metro_cnt_year['order']= list(range(1, metro_cnt_year.shape[0] + 1))
metro_cnt_year['metro_station']= metro_cnt_year.index
metro_cnt_year.index = list(range(1, metro_cnt_year.shape[0] + 1))
# Move the two just-added columns to the front.
metro_cnt_year = metro_cnt_year[list(metro_cnt_year.columns[-2:])+list(metro_cnt_year.columns[0:-2])]
In [48]:
import plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
In [49]:
def table_chart(df,title):
    """Show a plotly figure with a table of ``df`` on top and a line chart below.

    The table lists every column of ``df``. The line chart draws one trace for
    each of the first rows of ``df`` (at most 12): the second column is used as
    the trace name and the third-and-later columns as the x/y values.

    :param df: DataFrame whose first two columns are a rank and a label
        (``order`` and ``metro_station`` in this notebook), followed by the
        value columns.
    :param title: figure title (rendered in bold).
    :returns: None; the figure is displayed with ``plotly.offline.iplot``.
    """
    # Derive the per-column settings from the frame instead of assuming 8 columns.
    n_cols = len(df.columns)
    max_lines = 12  # number of top rows drawn in the line chart

    # Display names for the two non-year columns; every other column keeps its own name.
    header_labels = {"metro_station": "地铁站", "order": ""}
    header_values = [["<b>" + str(header_labels.get(col, col)) + "</b>"]
                     for col in df.columns]

    table_trace2 = go.Table(
        domain=dict(x=[0, 1],
                    y=[0.56, 1]),
        # Narrow first (rank) column, wider columns for everything else.
        columnwidth=[1] + [3] * (n_cols - 1),
        columnorder=list(range(n_cols)),
        header = dict(height = 40,
                      values = header_values,
                      line = dict(color='rgb(50, 50, 50)'),
                      align = ['left'] * n_cols,
                      font = dict(color=['rgb(45, 45, 45)'] * n_cols, size=12),
                      fill = dict(color='#d562be')),
        cells = dict(values = [df[k].tolist() for k in df.columns],
                     line = dict(color='#506784'),
                     align = ['left'] * n_cols,
                     font = dict(color=['rgb(40, 40, 40)'] * n_cols, size=[12] + [15] * (n_cols - 1)),
                     suffix=[None] * n_cols,
                     height = 27,
                     fill = dict(color=['rgb(255, 204, 255)','rgb(235, 193, 238)', 'rgba(228, 222, 249, 0.65)']))
    )

    x = list(df.columns[2:])
    trace_list  = [table_trace2]
    color_list = ["#ff0000", "#ff9900","#ffff00","#00ff00", "#00ffff",
                  "#4a86e8", "#0000ff", "#9900ff", "#ff00ff",  "#7f6000",
                  "#006600", "#0000cc", "#996633", "#660066"]
    # Never index past the end of the frame (or of the colour palette).
    for i in range(min(max_lines, len(df), len(color_list))):
        row = df.iloc[i].tolist()
        trace = go.Scatter(
            x=x,
            y=row[2:],
            xaxis='x1',
            yaxis='y1',
            mode='lines',
            line=dict(width=2, color=color_list[i]),
            name=row[1]
        )
        trace_list.append(trace)

    # Axis settings shared by the x and y axes of the line chart.
    axis=dict(
        showline=True,
        zeroline=False,
        showgrid=True,
        mirror=True,
        ticklen=4,
        gridcolor='#ffffff',
        tickfont=dict(size=10)
    )

    layout = dict(
        width=980,
        height=800,
        autosize=False,
        title='<b>%s</b>'%title,
        titlefont=dict(size=20, color="#000000"),
        margin = dict(t=50),
        showlegend=True,
        legend=dict(x=1.01, y=0.1),
        # The line chart occupies the lower half; the table sits in y = [0.56, 1].
        xaxis1=dict(axis, **dict(domain=[0, 1], anchor='y1', showticklabels=True)),
        yaxis1=dict(axis, **dict(domain=[0.0, 0.5], anchor='x1',hoverformat='d')),
        plot_bgcolor='rgba(228, 222, 249, 0.65)'
    )
    fig = dict(data=trace_list, layout=layout)
    plotly.offline.iplot(fig)
In [50]:
# Table + trend lines of the per-station rent values.
table_chart(metro_rent_year,title="2014年到2019年豆瓣深圳租房团地铁站附近平均单房房租及年趋势")
In [51]:
# Table + trend lines of the per-station row counts.
table_chart(metro_cnt_year, title="2014年到2019年(截止0615)豆瓣深圳租房团各地铁站附近相关租房帖子数及年趋势")
In [52]:
# Station -> district lookup: the district of the first row seen for each metro_station.
district_metro_map = df_rent_plot.drop_duplicates(['metro_station'])[['metro_station', 'district']]
In [53]:
# Year-over-year change (2019 vs 2018) as a fraction, rounded to 3 decimals.
# This adds the 'rate' column to metro_rent_year in place.
metro_rent_year['rate'] = ((metro_rent_year['2019'] - metro_rent_year['2018'])/metro_rent_year['2018']).apply(lambda x: np.round(x, 3))
metro_rent_2019 = metro_rent_year[['metro_station', '2019', 'rate']].copy()
# Attach each station's district.
metro_rent_2019 = metro_rent_2019.merge(district_metro_map, how="left", on=['metro_station'])
In [54]:
# Per-station 2019 row count; merged into the station table for the map further down.
metro_cnt_2019 = metro_cnt_year[['metro_station', '2019']]
In [55]:
# Stations whose district is '南山'.
# NOTE(review): this name does not appear to be used by any later cell -- confirm and remove if so.
metro_rent_ns_2019 = metro_rent_2019.loc[metro_rent_2019.district == "南山"]
In [56]:
# Bar (2019 value) + line (year-over-year rate) per district, switched by a dropdown.
# Traces are appended as [bar, line] pairs in district_list order; a button shows
# exactly the pair that belongs to its district.
title_template = "2019年豆瓣深圳租房团{0}区地铁站周围平均单房房租及同比涨幅"
traces_per_district = 2

data = []
# Only the first district is visible when the figure is first drawn.
visible_list = [i == 0 for i in range(len(district_list))]
for i, district in enumerate(district_list):
    df = metro_rent_2019.loc[metro_rent_2019.district == district]
    trace1 = go.Scatter(
        x=df['metro_station'],
        y=df['rate'],
        name='同比涨幅',
        yaxis = 'y2',
        mode='lines',
        visible=visible_list[i]
    )
    trace2 = go.Bar(
        x=df['metro_station'],
        y=df['2019'],
        name='2019年平均单房房租',
        marker = dict(color = 'blue', opacity=1),
        visible=visible_list[i]
    )
    data.append(trace2)
    data.append(trace1)

# One button per district, generated instead of copy-pasted. The hand-written version
# used district_list[4] in the title of the last button, so it showed the wrong district.
buttons = []
for i, district in enumerate(district_list):
    visible = [False] * (traces_per_district * len(district_list))
    visible[traces_per_district * i:traces_per_district * (i + 1)] = [True] * traces_per_district
    buttons.append(dict(label = district,
                        method = 'update',
                        args = [{'visible': visible},
                                {'title': title_template.format(district)}]))

update_menus = [
    dict(active=0,
         x=0.02,
         y=1.15,
         buttons=buttons)
]

layout = dict(
         # Initial title matches the initially visible (first) district.
         title = title_template.format(district_list[0]),
         yaxis = dict(side = 'left',showgrid = False, ticksuffix="元" ,zeroline = False),
         # Secondary axis on the right for the rate, formatted as a percentage.
         yaxis2 = dict(side = 'right', tickformat= "%",overlaying = "y" , showgrid = False, zeroline = False),
         legend=dict(x=1.05, y=0.8),
         updatemenus = update_menus)
fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)

地铁分布

In [57]:
# 2019 rows that have a metro_station.
line_rent = df_rent_plot.loc[(df_rent_plot.metro_station != "")& (df_rent_plot.post_year=='2019')]
In [58]:
# 'line' holds '|'-separated values; expand it so that each row carries a single value.
line_rent = line_rent[['line', 'room_rent']]
# split -> one column per part, stack -> one row per part, keeping the original row index.
line_rent_series = line_rent['line'].str.split(r'\|', expand=True).stack().\
                    reset_index(level=1, drop=True).rename('line')
# Joining on the index repeats room_rent for every part of the original 'line' value.
line_rent = line_rent.drop("line", axis=1).join(line_rent_series)
# Append the '号线' suffix to each value.
line_rent['line'] = line_rent['line'] + '号线'
In [59]:
# Number of rows per 'line' value, as a one-column DataFrame indexed by that value.
line_rent_count = pd.DataFrame(line_rent.line.value_counts())
In [60]:
# Bar chart of the number of rows per metro line.
trace = go.Bar(
        x=line_rent_count.index,
        # Take the single count column by position: its name is 'line' on pandas < 2.0
        # but 'count' on pandas >= 2.0, so attribute access by name is not portable.
        y=line_rent_count.iloc[:, 0],
        name='帖子数',
        marker = dict(color = 'blue', opacity=1))
layout = dict(
         title = "2019年豆瓣深圳租房团各地铁线附近租房帖子数分布",
         )
fig = dict(data=[trace], layout=layout)

plotly.offline.iplot(fig)
In [61]:
import math

PI = math.pi

def _transformlat(coordinates):
    lng = coordinates[ : , 0] - 105
    lat = coordinates[ : , 1] - 35
    ret = -100 + 2 * lng + 3 * lat + 0.2 * lat * lat + \
          0.1 * lng * lat + 0.2 * np.sqrt(np.fabs(lng))
    ret += (20 * np.sin(6 * lng * PI) + 20 *
            np.sin(2 * lng * PI)) * 2 / 3
    ret += (20 * np.sin(lat * PI) + 40 *
            np.sin(lat / 3 * PI)) * 2 / 3
    ret += (160 * np.sin(lat / 12 * PI) + 320 *
            np.sin(lat * PI / 30.0)) * 2 / 3
    return ret


def _transformlng(coordinates):
    lng = coordinates[ : , 0] - 105
    lat = coordinates[ : , 1] - 35
    ret = 300 + lng + 2 * lat + 0.1 * lng * lng + \
          0.1 * lng * lat + 0.1 * np.sqrt(np.fabs(lng))
    ret += (20 * np.sin(6 * lng * PI) + 20 *
            np.sin(2 * lng * PI)) * 2 / 3
    ret += (20 * np.sin(lng * PI) + 40 *
            np.sin(lng / 3 * PI)) * 2 / 3
    ret += (150 * np.sin(lng / 12 * PI) + 300 *
            np.sin(lng / 30 * PI)) * 2 / 3
    return ret


def gcj02_to_wgs84(coordinates):
    """
    Convert GCJ-02 coordinates to WGS-84.

    Only rows inside the bounding box 73.66 < lng < 135.05, 3.86 < lat < 53.55
    are shifted; all other rows are returned unchanged.

    :param coordinates: array-like of shape (n, 2) with GCJ-02 longitude in
        column 0 and latitude in column 1. It is not modified.
    :returns: new float numpy array of the same shape with WGS-84 longitude/latitude.
    """
    ee = 0.006693421622965943  # eccentricity squared
    a = 6378245  # semi-major axis
    # Work on a float copy: the result is written back row-wise below, which would
    # otherwise overwrite the caller's array (and truncate if it were integer-typed).
    result = np.array(coordinates, dtype=float)
    lng = result[ : , 0]
    lat = result[ : , 1]
    is_in_china= (lng > 73.66) & (lng < 135.05) & (lat > 3.86) & (lat < 53.55)
    _transform = result[is_in_china]  # only rows inside the box are offset

    dlat = _transformlat(_transform)
    dlng = _transformlng(_transform)
    radlat = _transform[ : , 1] / 180 * PI
    magic = np.sin(radlat)
    magic = 1 - ee * magic * magic
    sqrtmagic = np.sqrt(magic)
    dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * PI)
    dlng = (dlng * 180.0) / (a / sqrtmagic * np.cos(radlat) * PI)
    mglat = _transform[ : , 1] + dlat
    mglng = _transform[ : , 0] + dlng
    # Subtract the computed offset from the input: 2 * p - (p + d) == p - d.
    result[is_in_china] = np.array([
        _transform[ : , 0] * 2 - mglng, _transform[ : , 1] * 2 - mglat
    ]).T
    return result


def bd09_to_gcj02(coordinates):
    """
    Convert BD-09 coordinates to GCJ-02.

    :param coordinates: numpy array with BD-09 longitude in column 0 and latitude in column 1.
    :returns: new numpy array with GCJ-02 longitude in column 0 and latitude in column 1.
    """
    x_pi = PI * 3000 / 180
    # Remove the constant offsets, then work in polar form.
    dx = coordinates[:, 0] - 0.0065
    dy = coordinates[:, 1] - 0.006
    radius = np.sqrt(dx * dx + dy * dy) - 0.00002 * np.sin(dy * x_pi)
    angle = np.arctan2(dy, dx) - 0.000003 * np.cos(dx * x_pi)
    gcj_lng = radius * np.cos(angle)
    gcj_lat = radius * np.sin(angle)
    return np.array([gcj_lng, gcj_lat]).T


def bd09_to_wgs84(coordinates):
    """
    Convert BD-09 coordinates to WGS-84 by chaining BD-09 -> GCJ-02 -> WGS-84.

    :param coordinates: numpy array of BD-09 longitude/latitude pairs.
    :returns: numpy array of WGS-84 longitude/latitude pairs.
    """
    gcj02 = bd09_to_gcj02(coordinates)
    wgs84 = gcj02_to_wgs84(gcj02)
    return wgs84


def mercator_to_bd09(mercator):
    """
    Convert BD-09MC (Baidu Mercator) plane coordinates to BD-09 longitude/latitude.

    Each row picks one coefficient set from ``MC2LL`` according to the band of
    ``MCBAND`` that its absolute y value falls into, and is then evaluated by
    ``converter``.

    :param mercator: array-like of shape (n, 2) with x in column 0 and y in column 1.
        It is not modified.
    :returns: numpy array of shape (n, 2) with BD-09 longitude and latitude.
    """
    MCBAND = [12890594.86, 8362377.87, 5591021, 3481989.83, 1678043.12, 0]
    MC2LL = [[1.410526172116255e-08,   8.98305509648872e-06,    -1.9939833816331,
              200.9824383106796,       -187.2403703815547,      91.6087516669843,
              -23.38765649603339,      2.57121317296198,        -0.03801003308653,
              17337981.2],
            [-7.435856389565537e-09,  8.983055097726239e-06,   -0.78625201886289,
             96.32687599759846,       -1.85204757529826,       -59.36935905485877,
             47.40033549296737,       -16.50741931063887,      2.28786674699375,
             10260144.86],
            [-3.030883460898826e-08,  8.98305509983578e-06,    0.30071316287616,
             59.74293618442277,       7.357984074871,          -25.38371002664745,
             13.45380521110908,       -3.29883767235584,       0.32710905363475,
             6856817.37],
            [-1.981981304930552e-08,  8.983055099779535e-06,   0.03278182852591,
             40.31678527705744,       0.65659298677277,        -4.44255534477492,
             0.85341911805263,        0.12923347998204,        -0.04625736007561,
             4482777.06],
            [3.09191371068437e-09,    8.983055096812155e-06,   6.995724062e-05,
             23.10934304144901,       -0.00023663490511,       -0.6321817810242,
             -0.00663494467273,       0.03430082397953,        -0.00466043876332,
             2555164.4],
            [2.890871144776878e-09,   8.983055095805407e-06,   -3.068298e-08,
             7.47137025468032,        -3.53937994e-06,         -0.02145144861037,
             -1.234426596e-05,        0.00010322952773,        -3.23890364e-06,
             826088.5]]

    mercator = np.asarray(mercator, dtype=float)
    # MCBAND is descending, so the number of band limits strictly above |y| is the
    # index of the first band whose lower limit is <= |y|.
    abs_y = np.abs(mercator[ : , 1]).reshape((-1, 1))
    band_index = (abs_y < MCBAND).sum(axis=1)
    # Fancy indexing keeps the (n, 10) shape even for n == 0.
    coef = np.asarray(MC2LL)[band_index]
    # Pass the signed values: converter() takes the magnitudes itself and uses the
    # sign to restore the hemisphere, so handing it abs() values would lose the sign
    # of negative coordinates. Copies are passed because converter() may write to its inputs.
    return converter(mercator[ : , 0].copy(), mercator[ : , 1].copy(), coef)


def converter(x, y, coef):
    """Evaluate the per-row conversion polynomials.

    The first output column is ``coef[:, 0] + coef[:, 1] * |x|``; the second is
    a degree-6 polynomial in ``|y| / coef[:, 9]`` with coefficients
    ``coef[:, 2] .. coef[:, 8]``. Each column is then given the sign of the
    corresponding input (negative input -> negative output).

    :param x: 1-D array-like of x values; not modified.
    :param y: 1-D array-like of y values; not modified.
    :param coef: array of shape (n, 10), one coefficient row per input value.
    :returns: numpy array of shape (n, 2).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    x_temp = coef[ : ,0] + coef[ : ,1] * np.abs(x)
    x_n = np.abs(y) / coef[ : ,9]
    y_temp = coef[ : ,2] + coef[ : ,3] * x_n + coef[ : ,4] * x_n ** 2 + \
             coef[ : ,5] * x_n ** 3 + coef[ : ,6] * x_n ** 4 + coef[ : ,7] * x_n ** 5 + \
             coef[ : ,8] * x_n ** 6
    # Compute the signs into new arrays rather than overwriting x and y in place,
    # which would replace the caller's data with -1 / 1.
    x_temp = x_temp * np.where(x < 0, -1, 1)
    y_temp = y_temp * np.where(y < 0, -1, 1)
    coordinates = np.array([x_temp, y_temp]).T
    return coordinates
In [62]:
import requests
In [63]:
# Request the list of lines and stops from the Baidu Maps endpoint (c=340 is passed as-is).
# NOTE(review): the request uses plain http and has no timeout, so it can hang indefinitely.
url = "http://map.baidu.com/?qt=bsi&c=340"
# Headers reused by the per-line requests in a later cell.
headers  = {
         "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
         "Accept-Encoding": "gzip, deflate, sdch",
        }
s = requests.get(url, headers=headers)
In [64]:
import json

# Parse the response body as JSON. This replaces eval() on the raw bytes, which
# executes whatever the remote server sends back and also fails on the JSON
# literals true / false / null.
s_content = json.loads(s.content)
In [65]:
#bd09_to_wgs84(mercator_to_bd09(np.array([[12706537.33, 2564680.52]]))).tolist()
In [66]:
# Collect, per metro line: its name, its uid, its stop names and the stop coordinates.
line_name_list = []
line_uid_list = []
stops_list = []
xys_list = []
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
line_name_pattern = re.compile(r'(\d+号线).*')
for line in s_content['content']:
    matched = line_name_pattern.match(line['line_name'])
    if matched is None:
        # The name does not start with "<digits>号线": skip it instead of
        # crashing on None.group(1).
        continue
    line_name = matched.group(1)
    # The same line name can appear more than once; keep only the first entry.
    if line_name in line_name_list:
        continue
    line_name_list.append(line_name)
    line_uid_list.append(line['line_uid'])
    stop_list = []
    xy_list = []
    for stop in line['stops']:
        # '站' is appended so that the names match the metro_station values used elsewhere.
        stop_list.append(stop['name']+'站')
        xy_list.append([stop['x'], stop['y']])
    stops_list.append(stop_list)
    # Stop positions are converted BD-09MC -> BD-09 -> WGS-84.
    xys_list.append(bd09_to_wgs84(mercator_to_bd09(np.array(xy_list))))
In [67]:
import json
class MyEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as UTF-8 text."""

    def default(self, obj):
        """
        Decode ``bytes`` to ``str``; defer every other type to the base class.

        :param obj: object the standard encoder could not serialise.
        :return: a ``str`` for bytes input; otherwise whatever the base class
            returns (it raises ``TypeError`` for unsupported types).
        """
        if not isinstance(obj, bytes):
            return super().default(obj)
        return obj.decode('utf-8')
In [68]:
import time
In [69]:
# For every line uid: fetch the line detail, keep its colour and its converted track geometry.
line_color_list = []
trace_list = []
for line_uid in line_uid_list:
    url = "https://map.baidu.com/?qt=bsl&tps=&newmap=1&uid={0}&c=340".format(line_uid)
    s = requests.get(url, headers=headers)
    # json.loads() has no `encoding` parameter any more (removed in Python 3.9, ignored
    # before that), and it expected a codec name rather than an encoder class anyway.
    s_content = json.loads(s.content)
    line_color_list.append(s_content['content'][0]['line_color'])
    # 'geo' is '|'-separated; the third part is a comma-separated list of numbers with a
    # trailing character that is stripped before parsing them as (x, y) pairs.
    trace_mercator = np.array(s_content['content'][0]['geo'].split('|')[2][:-1].split(','), dtype=float).reshape((-1, 2))
    trace_list.append(bd09_to_wgs84(mercator_to_bd09(trace_mercator)))
    # Pause between requests to avoid hammering the endpoint.
    time.sleep(1)
In [70]:
# Column-wise collection of the per-line lists built above; entry k of each list refers to the same line.
line_info = {'line_name': line_name_list, 'metro_stations': stops_list, 'mst_coords': xys_list, 'line_color': line_color_list, 'trace_coords': trace_list}
In [71]:
# One row per metro line.
df_line_info = pd.DataFrame(line_info)
In [72]:
#df_line_info.head()
In [73]:
# Expand the per-line lists into one row per (line, station).
dfs  = list()
for i, row in df_line_info.iterrows():
    mst_list = row['metro_stations']
    mstc_list = row['mst_coords']
    # Repeat the line-level attributes once per station of the line.
    line_name = [row['line_name']]*len(mstc_list)
    line_color = [row['line_color']]*len(mstc_list)
    df = pd.DataFrame(dict(metro_station=mst_list, mst_coord=mstc_list.tolist(), line_name=line_name, line_color=line_color))
    dfs.append(df)
# Stack the per-line frames into a single station table.
df_metro_info = pd.concat(dfs, axis=0)    
In [74]:
# Attach the 2019 row count to every station; stations without a match get NaN.
df_metro_info = df_metro_info.merge(metro_cnt_2019, how='left', on='metro_station')
In [75]:
# Replace every NaN in the frame with 0 (this includes the '2019' count of unmatched stations).
df_metro_info.fillna(0, inplace=True)
In [76]:
# Marker size: sqrt(count + 1) / 1.5 + 2, rounded to 2 decimals.
df_metro_info['size_r'] = df_metro_info['2019'].apply(lambda x: np.round(np.sqrt(x+1)/1.5 + 2,2))
# Hover text: "<station>,<count>".
df_metro_info['hoverinfo'] = df_metro_info['metro_station'] + ',' +  df_metro_info['2019'].apply(lambda x: str(int(x)))
In [77]:
import os

# Mapbox access token: taken from the MAPBOX_TOKEN environment variable when it is set.
# NOTE(review): the literal fallback is kept only so the notebook keeps rendering as
# before; credentials should not live in source -- rotate this token and drop the fallback.
token = os.environ.get(
    "MAPBOX_TOKEN",
    "pk.eyJ1IjoiaHVhbmdzaGl5YW5nIiwiYSI6ImNqeTZwaWJ3MjBrcXMzbW9id2tobDRubHMifQ.zdUbKs85QiEUquPMdjleGA",
)
data = []
layout = go.Layout(
    title='2019年豆瓣深圳租房团租房贴子数地铁线路分布图',
    width =960,
    height = 600,
    autosize=True,
    mapbox=dict(
        accesstoken=token,
        style = 'mapbox://styles/huangshiyang/cjy6xtpl40yxw1cohcdfdtaoq',
        bearing=0,
        center=dict(
            lat=22.5427758091,  # latitude of the map centre
            lon= 114.0579807461  # longitude of the map centre
        ),
        pitch=0,
        zoom=10
    ),
)

for i, row in df_line_info.iterrows():
    line_name = row['line_name']
    # Select this line's stations once instead of re-filtering for every attribute.
    line_stations = df_metro_info.loc[df_metro_info.line_name==line_name]
    station_coordinates = np.array(line_stations.mst_coord.tolist())
    size = line_stations.size_r.tolist()
    hoverinfo = line_stations.hoverinfo.tolist()
    trace_coordinates = row['trace_coords']
    metros = row['metro_stations']

    color = row['line_color']
    # Trace 1: the track of the line.
    data.append(go.Scattermapbox(
        lon=trace_coordinates[:, 0],  # longitudes of the track points
        lat=trace_coordinates[:, 1],  # latitudes of the track points
        mode='lines',
        hoverinfo='none',
        line=go.scattermapbox.Line(
            width=2,
            color=color
        ),
        name=line_name,  # line name, shown in the legend
        legendgroup=line_name
    ))
    # Trace 2: the stations of the line, sized by size_r.
    data.append(go.Scattermapbox(
            lon=station_coordinates[:, 0],  # station longitudes
            lat=station_coordinates[:, 1],  # station latitudes
            mode='markers+text',
            text=metros,
            textfont=dict(size=8),
            hovertext=hoverinfo,
            hoverinfo='text',
            marker=go.scattermapbox.Marker(
                size=size,
                color=color,
            ),
            opacity=0.6,
            name=line_name,
            legendgroup=line_name,  # same group as the track, so both hide together
            showlegend=False  # the track trace already provides the legend entry
    ))

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

限女生分析

In [78]:
# Rows with rent_type '招租'/'转租' and a district. The post_year >= '2014' condition
# repeats the filter already applied when df_rent_plot was created.
df_rent_for = df_rent_plot.loc[df_rent_plot.rent_type.isin(['招租','转租'])& (df_rent_plot.district != "") & (df_rent_plot.post_year >= '2014')]
# Mean of the integer girl_only column per district and year.
pivot_rent_for = pd.pivot_table(df_rent_for, values='girl_only', index='district', columns='post_year', aggfunc=np.mean )
In [79]:
# Format every cell as a percentage string with one decimal, e.g. 0.123 -> '12.3%'.
pivot_rent_for = pivot_rent_for.applymap(lambda x: str(np.round(x*100, 1))+'%')
In [80]:
# Copy the district index into a regular column so it can be shown in the table.
pivot_rent_for['地区'] = pivot_rent_for.index
# pivot_rent_for.head()
In [81]:
# Fix the column order: district first, then the years; raises KeyError if a year column is missing.
pivot_rent_for = pivot_rent_for[['地区','2014', '2015', '2016', '2017', '2018', '2019']]
In [82]:
# Render the percentage table.
# A dedicated name is used for the column headers: the previous `headers` overwrote the
# HTTP headers dict that the request cells pass to requests.get(), so re-running those
# cells after this one sent a list of column names as headers.
table_headers = list(pivot_rent_for.columns)
data = [go.Table(header=dict(values=table_headers, font=dict(size=10), align='left', fill = dict(color='#d562be')),
                 cells=dict(values=[pivot_rent_for[k].tolist() for k in pivot_rent_for.columns],align='left'))]
layout = dict(title="2014年到2019(截止0615)豆瓣深圳租房团限女生帖子数占比",height=400)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [83]:
# Rows with rent_type '招租'/'转租', a district, a non-null rent and girl_only == 1.
marked = (df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.district != "")& (pd.notna(df_rent_plot.room_rent))&(df_rent_plot.girl_only==1)
district_rent = df_rent_plot.loc[marked]
# filter_mean of the rent per (year, district), ordered by year.
district_rent_year_girl = district_rent[['post_year', 'district', 'room_rent']].groupby(['post_year', 'district']).agg({'room_rent': filter_mean}).reset_index()
district_rent_year_girl = district_rent_year_girl.sort_values(by='post_year', ascending=True)
In [84]:
# Same selection without the girl_only condition.
marked = (df_rent_plot.rent_type.isin(['招租','转租']))& (df_rent_plot.district != "")& (pd.notna(df_rent_plot.room_rent))
district_rent = df_rent_plot.loc[marked]
# filter_mean of the rent per (year, district), ordered by year.
district_rent_year = district_rent[['post_year', 'district', 'room_rent']].groupby(['post_year', 'district']).agg({'room_rent': filter_mean}).reset_index()
district_rent_year = district_rent_year.sort_values(by='post_year', ascending=True)
In [85]:
# Put the girl_only value next to the overall one as 'room_rent2'.
# NOTE(review): this cell rebinds district_rent_year, so running it twice merges
# 'room_rent2' a second time -- re-run the previous cell first.
district_rent_year_girl.rename(columns={'room_rent': 'room_rent2'}, inplace=True)
district_rent_year = district_rent_year.merge(district_rent_year_girl, how='left', on=['post_year', 'district'])
In [86]:
# Highest overall value first, so the bars of each year are drawn in descending order.
district_rent_year = district_rent_year.sort_values(by="room_rent", ascending=False)
In [87]:
# Distinct post_year values in descending order (latest year first).
year_list = sorted(district_rent_year.post_year.unique())[::-1]
In [88]:
# Two bars per district (overall vs. girl_only) for each year, switched by a dropdown.
# Traces are appended as [overall, girl_only] pairs in year_list order; a button shows
# exactly the pair that belongs to its year. Everything is derived from year_list, so
# the figure works for any number of years instead of exactly six.
title_template = "{0}年豆瓣深圳租房团限女生帖与整体平均单房房租对比"
traces_per_year = 2

data = []
# Only the first (latest) year is visible when the figure is first drawn.
visible_list = [i == 0 for i in range(len(year_list))]
for i, post_year in enumerate(year_list):
    df = district_rent_year.loc[district_rent_year.post_year==post_year]
    trace1 = go.Bar(
        x=df['district'],
        y=df['room_rent'],
        name='平均单房月租',
        marker = dict(color = 'blue', opacity=1),
        visible=visible_list[i]
    )
    trace2 = go.Bar(
        x=df['district'],
        y=df['room_rent2'],
        name='限女生帖子平均单房月租',
        yaxis='y',
        marker = dict(color = 'purple', opacity=1),
        visible=visible_list[i]
    )
    data.append(trace1)
    data.append(trace2)

# One button per year, generated instead of copy-pasted.
buttons = []
for i, post_year in enumerate(year_list):
    visible = [False] * (traces_per_year * len(year_list))
    visible[traces_per_year * i:traces_per_year * (i + 1)] = [True] * traces_per_year
    buttons.append(dict(label = post_year,
                        method = 'update',
                        args = [{'visible': visible},
                                {'title': title_template.format(post_year)}]))

update_menus = [
    dict(active=0,
         x=0.02,
         y=1.15,
         buttons=buttons)
]

layout = dict(
         # Same wording as the button titles; the initial title used to differ from
         # the title shown after selecting the very same year.
         title = title_template.format(year_list[0]),
         yaxis = dict(side = 'left' ,ticksuffix="元",zeroline = False),
         legend=dict(x=1.05, y=0.8),
         updatemenus = update_menus)
fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)
In [89]:
from IPython.display import HTML
# Inject a small script plus a button that toggles the notebook's input cells and
# prompts. code_toggle() is registered to run once when the document is ready; on that
# first call code_show is true, so the inputs and prompts are hidden.
# The script relies on `$` (jQuery) being available in the page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 $('div.prompt').hide();
 } else {
 $('div.input').show();
 $('div.prompt').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="toggle on/off code"></form>''')
Out[89]: